In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

The Partial Fit Interface

Out-of-core linear classification


In [ ]:
# cPickle only exists on Python 2. Fall back to the Python 3 `pickle` module
# under the same name so every `cPickle.load` call in this notebook keeps working.
try:
    import cPickle
except ImportError:
    import pickle as cPickle
from sklearn.linear_model import SGDClassifier

# Fixed seed so the incremental fit gives the same model on every run.
sgd = SGDClassifier(random_state=0)

# Feed batches 00-08 to the classifier one at a time; each iteration rebinds
# X_batch / y_batch, so only the current batch is referenced.
for i in range(9):
    # Pickle files must be read in binary mode, and `with` closes the handle
    # right after loading instead of leaking one open file per batch.
    # NOTE(security): unpickling can run arbitrary code -- only load trusted files.
    with open("data/batch_%02d.pickle" % i, "rb") as batch_file:
        X_batch, y_batch = cPickle.load(batch_file)
    # The full label set (0..9) is passed explicitly on every call, since a
    # single batch is not guaranteed to contain every class.
    sgd.partial_fit(X_batch, y_batch, classes=range(10))

In [ ]:
# Load batch 09 as the evaluation set. Binary mode is required for pickle
# data, and the `with` block closes the file handle once loading is done.
# NOTE(security): unpickling can run arbitrary code -- only load trusted files.
with open("data/batch_09.pickle", "rb") as test_file:
    X_test, y_test = cPickle.load(test_file)

# Last expression of the cell: the classifier's score on the loaded batch.
sgd.score(X_test, y_test)

In [ ]:
# Seeded like the other classifiers in this notebook so the accuracy curve
# is reproducible from run to run.
sgd = SGDClassifier(random_state=0)

# Progressive validation: score each incoming batch *before* training on it.
# The leading 0 is a placeholder for "no batches seen yet", because the model
# cannot be scored before its first partial_fit call.
accuracies = [0]
for i in range(9):
    # Binary mode + `with`: correct for pickle data and no leaked file handles.
    # NOTE(security): unpickling can run arbitrary code -- only load trusted files.
    with open("data/batch_%02d.pickle" % i, "rb") as batch_file:
        X_batch, y_batch = cPickle.load(batch_file)
    if i > 0:
        # The batch has not been used for fitting yet, so this score is
        # measured on data the model has not trained on.
        accuracies.append(sgd.score(X_batch, y_batch))
    sgd.partial_fit(X_batch, y_batch, classes=range(10))

In [ ]:
# Draw the accuracy curve on an explicit Axes object rather than through the
# implicit pyplot state machine; labels are unchanged.
fig, ax = plt.subplots()
ax.plot(accuracies)
ax.set_xlabel("batches seen")
ax.set_ylabel("generalization performance")

Loading chunks using Pandas


In [ ]:
import pandas as pd

# With `chunksize`, read_csv yields the file as successive DataFrames of up
# to 100 rows instead of loading the whole CSV at once.
csv_iterator = pd.read_csv("data/digits.csv", chunksize=100)

# Seeded like the other classifiers in this notebook for reproducible results.
sgd = SGDClassifier(random_state=0)

for batch in csv_iterator:
    # Every column except the last is used as features; the last is the target.
    X = batch.iloc[:, :-1]
    y = batch.iloc[:, -1]
    # Fit on the chunk that was just read. The previous code passed the stale
    # X_batch / y_batch left over from an earlier cell, so the CSV data was
    # never actually used for training.
    sgd.partial_fit(X, y, classes=range(10))

Algorithms that support partial_fit


In [ ]:
# `all_estimators` is exposed from sklearn.utils in newer scikit-learn
# releases; older releases only provide it via sklearn.utils.testing, so try
# the new location first and fall back to the old one.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

# Print every estimator class that has a partial_fit attribute, formatted as
# "<second component of its module path>.<estimator name>".
for name, Class in all_estimators():
    if hasattr(Class, "partial_fit"):
        print("%s.%s" % (Class.__module__.split(".")[1], name))

Multiple iterations


In [ ]:
sgd = SGDClassifier(random_state=0)

# Batch 09 is loaded once and reused to score the model after every pass.
# Binary mode + `with`: correct for pickle data and no leaked file handle.
# NOTE(security): unpickling can run arbitrary code -- only load trusted files.
with open("data/batch_09.pickle", "rb") as test_file:
    X_test, y_test = cPickle.load(test_file)

accuracies = []
for iteration in range(20):
    # One full pass over batches 00-08. Each batch is re-read from disk on
    # every pass, so the context manager matters here: without it this loop
    # would leave 180 file handles open.
    for i in range(9):
        with open("data/batch_%02d.pickle" % i, "rb") as batch_file:
            X_batch, y_batch = cPickle.load(batch_file)
        sgd.partial_fit(X_batch, y_batch, classes=range(10))
    # Record the score on the test batch after each complete pass.
    accuracies.append(sgd.score(X_test, y_test))

In [ ]:
# One point per entry in `accuracies`; label the axes so the figure can be
# read on its own, like the earlier accuracy plot in this notebook.
plt.plot(accuracies)
plt.xlabel("iteration")
plt.ylabel("test accuracy")